 /*
Purpose: 	Clean Census Boundary and Annexation Survey (BAS) Data
Author: 	Kiara Wyndham-Douds
Dataset: 	U.S. Dates of Municipal Incorporation 
Date:		July 6, 2022
*/

/*This file cleans data from Census Boundary and Annexation Survey (BAS) - https://www.census.gov/programs-surveys/bas.html.
BAS data tables are highly accurate for new incorporations since its inception 
in 1982 but do not contain consistent information for municipalities incorporated 
prior to this. 

This file cleans BAS data and then saves incorporations from 1987 on to supplement
Census of Government data (cleaned in incorp-clean01-cog.do).*/

**************************
***IMPORT 1980s
**************************

import excel "new_incs_1980-1989_forstata.xlsx", sheet("Sheet1") firstrow allstring
drop L M
*377 obs

*problem: All dates look right, except some that appear in different format. manual fix: 

replace effective_date= "11jun1711" if placefips =="53472"

replace effective_date= "13dec1819" if placefips =="55044"

replace effective_date= "01jan1980" if placefips =="69280"

replace effective_date= "01jan1848" if placefips =="00820"

replace effective_date= "01jan1853" if placefips =="16678"

replace effective_date= "14jan1845" if placefips =="07390"

replace effective_date= "25jan1833" if placefips =="16894"

replace effective_date= "01feb1838" if placefips =="64146"

replace effective_date= "14feb1866" if placefips =="51978"


*extract year of incorporation
gen year_incorp = substr(effective_date,6,9) 
tab year_incorp,m
label var year_incorp "year of incorporation BAS"

*drop obs missing year of incorporation
drop if year_incorp==""
*32 obs dropped

gen bas_decade=1980
label var bas_decade "decade included in BAS"

codebook placefips
*missing 1 placefips

* drop obs missing placefips
drop if placefips==""
*1 obs dropped


save bas_1980s_raw.dta, replace
clear

**************************
***IMPORT 1990s
**************************

import excel "new_incs_1990-1999_forstata.xlsx", sheet("Sheet1") firstrow allstring

*problem: All dates look right, except some that appear in different format. manual fix: 
sort effective_date

replace effective_date= "14aug1998" if placefips =="08665"

replace effective_date= "15dec1999" if placefips =="01457"

*extract year of incorporation
gen year_incorp = substr(effective_date,6,9) 
tab year_incorp,m
label var year_incorp "year of incorporation BAS"

*ANALYTIC SAMPLE: drop obs missing year of incorporation
drop if year_incorp==""
*1 obs dropped

gen bas_decade=1990
label var bas_decade "decade included in BAS"

save bas_1990s_raw.dta, replace
clear


**************************
***IMPORT 2000s
**************************

import excel "2000-2009entitychanges_forstata.xlsx", sheet("Incorporations") firstrow allstring
*182 obs

*note: dates with one-number months need leading zeros

*extract year of incorporation
*tell stata effective_date is a date
gen double effective_date_d = date(effective_date, "MDY") 
format effective_date_d %td

generate year_incorp=year(effective_date_d)
tostring year_incorp, replace
tab year_incorp,m

gen bas_decade=2000
label var bas_decade "decade included in BAS"

save bas_2000s_raw.dta, replace
clear

**************************
***IMPORT 2010s
**************************
import excel "2010-2019entitychanges_forstata.xlsx", sheet("New GUs") firstrow allstring
*74 obs

*extract year of incorporation
*tell stata effective_date is a date
gen double effective_date_d = date(effective_date, "MDY") 
format effective_date_d %td

generate year_incorp=year(effective_date_d)
tostring year_incorp, replace
tab year_incorp,m

gen bas_decade=2010
label var bas_decade "decade included in BAS"

*fix statefips
replace statefips="01" if statefips=="1"
replace statefips="02" if statefips=="2"
replace statefips="04" if statefips=="4"
replace statefips="05" if statefips=="5"
replace statefips="06" if statefips=="6"
replace statefips="08" if statefips=="8"
tab statefips,m

save bas_2010s_raw.dta, replace
clear

**************************
***APPEND ALL DECADES TOGETHER
**************************

use using bas_1980s_raw.dta
append using bas_1990s_raw.dta
append using bas_2000s_raw.dta
append using bas_2010s_raw.dta
*925 obs

tab year_incorp,m

tab bas_decade,m

save bas_1980_2019_raw.dta, replace
clear

**************************
***CREATE YEAR OF INCORP VAR
**************************

use bas_1980_2019_raw.dta

destring year_incorp, replace

rename placefips placefips_part
gen placefips = statefips + placefips
rename place_name_incorp place_name_bas

distinct placefips
*925 total, 905 distinct

*id duplciates
sort placefips bas_decade
quietly by placefips:  gen dup = cond(_N==1,0,_n)
tab dup
list placefips dup place_name_bas bas_decade if dup>0
*some places listed twice in BAS - keeping earlier
drop if dup>1
drop dup
distinct placefips

keep placefips place_name_bas year_incorp


drop if year_incorp<1987
tab year_incorp,m
*634 remaining

save bas_dec_incorp_1987_2019.dta, replace

**********************************
*MERGE IN NHGIS CODES SO CAN BE MATCHED ACROSS DECADES
**********************************

/*IPUMS NHGIS has created a place code identifer - NHGISPLACE  - that is consistent
over time. (https://www.nhgis.org/documentation/gis-data/place-points#identifiers)
I use the NHGIS Place Point 1990-2015 files, which contain 1990-2015 
place FIPS codes and the NHGISPLACE codes. These matches will allow this data 
file to be matched to other place codes in other time periods. */

/* Strategy: Start wtih 2015 and merge in earlier decades until all place codes in 
BAS are matched to NHGISPLACE codes.*/

******2015 NHGIS place point data
use nghis_gis_placepoint_15.dta //raw 2015 place point data obtained from IPUMS NHGIS (https://www.nhgis.org)
*29,575 places

codebook gisjoin

gen placefips_short = substr(gisjoin,5,5) 
gen statefips = substr(gisjoin,2,2) 

gen placefips= statefips + placefips_short
codebook placefips
save nghis_gis_placepoint_15_placefips.dta, replace
clear


******2010 NHGIS place point data
use nghis_gis_placepoint_10.dta //raw 2010 place point data obtained from IPUMS NHGIS (https://www.nhgis.org)
*29,514 places

codebook gisjoin

gen placefips_short = substr(gisjoin,5,5) 
gen statefips = substr(gisjoin,2,2) 

gen placefips= statefips + placefips_short
codebook placefips
save nghis_gis_placepoint_10_placefips.dta, replace
clear


******2000 NHGIS place point data

use nghis_gis_placepoint_00.dta //raw 2000 place point data obtained from IPUMS NHGIS (https://www.nhgis.org)
*25,150 places

codebook gisjoin

gen placefips_short = substr(gisjoin,5,5) 
gen statefips = substr(gisjoin,2,2) 

gen placefips= statefips + placefips_short
codebook placefips

save nghis_gis_placepoint_00_placefips.dta, replace
clear


******1990
use nghis_gis_placepoint_90.dta //raw 1990 place point data obtained from IPUMS NHGIS (https://www.nhgis.org)
*23,435 places

codebook gisjoin

gen placefips_short = substr(gisjoin,5,5) 
gen statefips = substr(gisjoin,2,2) 

gen placefips= statefips + placefips_short
codebook placefips

save nghis_gis_placepoint_90_placefips.dta, replace
clear

*Note: 1980 not used b/c it does not provide any new information (e.g., any new matching NHGSIS codes).

****** Merge into BAS data
use bas_dec_incorp_1987_2019.dta

merge 1:1 placefips using nghis_gis_placepoint_15_placefips.dta
*565 matched 
*69 not matched from master
drop if _merge==2
drop _merge

merge 1:1 placefips using nghis_gis_placepoint_10_placefips.dta, replace update
*546 matched 
*88 not matched from master
codebook nhgisplace
*still missing nhgisplace for 67 obs
drop if _merge==2
drop _merge

merge 1:1 placefips using nghis_gis_placepoint_00_placefips.dta, replace update
*424 matched
*210 not matched from master
codebook nhgisplace
*still missing nhgisplace for 55 obvs
drop if _merge==2 & year_incorp==.
drop _merge

merge 1:1 placefips using nghis_gis_placepoint_90_placefips.dta, replace update
*199 matched
*435 not matched from master
codebook nhgisplace
*still missing nhgisplace for 48 obs
drop if _merge==2 & year_incorp==.
drop _merge

sort year_incorp
list place_name_bas state_name placefips year_incorp state_name if nhgisplace==""
tab year_incorp if  nhgisplace==""
*one or two missing most years starting in 1991. 

/*
Decision: drop because the BAS place codes do not match census codes. May be mistake or 
geography other than place (sometimes counties and other entitities end up in BAS).
*/

drop if nhgisplace==""

save bas_dec_incorp_1987_2019_nhgiscode.dta,replace










